A general framework for unbiased mixed variables distance

supplementary material

Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.

Data generation

These are the simulation parameers

Code
reps<-100  # 100 can take 30min
n<-500
k<-2 # Dimension solution
sigma<-0.03 # Noise

porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-2 # Number of numerical variables underying config

pnum<-pn+pnnoise+pcnoise  # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config


methods = c("baseline","naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
method_levels <- c("baseline", "naive", "hl", "hl_add", "gower","gudmm", "dkps", "mod_gower", "u_ind", "u_dep", "u_mix")


my_colors <- c(
  # Set B
  "baseline"    = "#8C564B",  # brown
  "naive"       = "#E377C2",  # pink
  "hl"          = "#9C9E00",  # darker yellow-green (better contrast)
  "gudmm"       = "#FF7F0E",  # orange
  "dkps"        = "#7F7F7F",  # darker grey (more visible)

  # Set A
  "hl_add"      = "#9EC5E5",  # light blue
  "u_ind"       = "#2CA02C",  # green
  "u_dep"       = "#D62728",  # red
  "u_mix" = "#1E90FF",
  "gower"       = "#FFBF00",  # gold
  "mod_gower"   = "#9467BD"   # purple
)

my_shapes <- c(
  "baseline"    = 16,  # filled circle
  "naive"       = 17,  # filled triangle
  "hl"          = 15,  # filled square
  "hl_add"      = 3,   # plus
  "gower"       = 4,   # cross
  "gudmm"       = 18,  # filled diamond
  "dkps"        = 8,   # star
  "mod_gower"   = 0,   # empty square
  "u_ind"       = 1,   # empty circle
  "u_dep"       = 2,    # empty triangle
  "u_mix" = 5 # empty diamond
)

The considered methods are

additive distances

  • baseline: Manhattan distance on all the standardized numerical variables (before discretization)

  • gower: classic Gower distance

  • mod_gower: modified version of the Gower distance, as proposed in Liu et al. (2024)

  • hl_add: Manhattan distance with scaling of indicators as proposed by Hennig and Liao (2013)

  • u_ind: commensurable with simple matching

  • u_dep: commensurable PCA-scaled numerical and total-variance categorical

  • u_mix: commensurable Manhattan numerical and total-variance categorical

non-additive distances

  • naive: Euclidean distance on all variables, with one-hot encoded categorical variables.

  • hl: Euclidean distance with scaling of indicators as proposed by Hennig and Liao (2013)

  • gudmm: Generalized Unified Distance Metric for Mixed-type data as proposed by Mousavi and Sehhati (2023)

  • dkps: Distance using kernel product similarity as proposed by Ghashti and Thompson (2024)

Compute distances: leave one variable out

Code
library(progress)


pb <- progress_bar$new(
  format = "  Simulating [:bar] :percent eta: :eta",
  total = reps * length(methods),
  clear = FALSE,
  width = 60
)

# methods = c("baseline","naive","gower","hl","u_dep","u_ind","gudmm","dkss","mod_gower")


generated_datasets = tibble(replicate=1:reps) |> 
  mutate(dataset = map(.x=replicate, ~generate_dataset(n = n, pn=pn, porig=porig, pnnoise =pnnoise , pcnoise = pcnoise, sigma = sigma, qoptions = qoptions, seed = 1234+.x)),
         method=map(.x=replicate,~methods)
         ) |> unnest(method)|>
  left_join(mdist_method_lookup, by = "method")
Code
## a little helper for mdist

run_mdist_within <- function(df, mdist_type, mdist_preset, param_set, outcome = "response") {
  x <- df

  if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) {
    x <- dplyr::select(df, -dplyr::all_of(outcome))
  }

  if (mdist_type == "preset") {
    out <- manydist::mdist(x = x, preset = mdist_preset)

  } else if (mdist_type == "custom") {
    if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'")
    args <- param_set
    args$x <- x
    out <- do.call(manydist::mdist, args = args)

  } else {
    stop("Unknown mdist_type: ", mdist_type)
  }
if (identical(mdist_type, "preset") && identical(mdist_preset, "gower")) {
  
  out$distance*ncol(x)
  }else{
  out$distance
  }
}

# little helper for the leave one variable out version of mdist
run_lovo <- function(df, mdist_type, mdist_preset, param_set,
                     outcome = NULL, dims = 2, keep_dist = FALSE,
                     legacy_gower_sum = FALSE) {

  # drop outcome if present
  x <- df
  if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) {
    x <- dplyr::select(df, -dplyr::all_of(outcome))
  }

  # ---- special case: legacy Gower SUM scaling for LOVO ----
  if (legacy_gower_sum && identical(mdist_type, "preset") && identical(mdist_preset, "gower")) {

    vars <- names(x)
    p_full <- length(vars)

    # full
    D_full <- manydist::mdist(x = x, preset = "gower")$distance |> as.matrix()
    D_full <- D_full * p_full

    base_mds <- cmdscale(D_full, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE]
    loo_list <- vector("list", length(vars)); names(loo_list) <- vars

    for (i in seq_along(vars)) {
      var <- vars[i]
      x_sub <- dplyr::select(x, -dplyr::any_of(var))
      p_loo <- ncol(x_sub)

      D_loo <- manydist::mdist(x = x_sub, preset = "gower")$distance |> as.matrix()
      D_loo <- D_loo * p_loo

      loo_list[[i]] <- D_loo
    }

    mad <- vapply(loo_list, function(m) mean(abs(D_full - m)), numeric(1))
    cc  <- vapply(loo_list, function(m) {
      pts <- cmdscale(m, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE]
      congruence_coeff(base_mds, pts)
    }, numeric(1))
    ac <- sqrt(1 - cc^2)

    res <- tibble::tibble(
      variable       = vars,
      mad_importance = mad,
      cc_importance  = cc,
      ac_importance  = ac,
      mad_normalized = mad / sum(mad)
    )

    out <- list(results = res, base_mds = base_mds)
    if (keep_dist) {
      out$full_dist <- D_full
      out$loo_dist  <- loo_list
    }
    return(out)
  }

  # ---- default path: your package's LOVO ----
  if (mdist_type == "preset") {
    obj <- manydist::lovo_mdist(x = x, preset = mdist_preset, dims = dims, keep_dist = keep_dist)

  } else if (mdist_type == "custom") {
    if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'")
    args <- param_set
    args$x <- x
    args$dims <- dims
    args$keep_dist <- keep_dist
    obj <- do.call(manydist::lovo_mdist, args = args)

  } else {
    stop("Unknown mdist_type: ", mdist_type)
  }

  obj
}
Code
simulation_structure <- generated_datasets |>
  dplyr::mutate(
    loo_results = purrr::pmap(
      dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set),
      \(dataset, method, mdist_type, mdist_preset, param_set) {
        pb$tick()
        X <- if (method == "baseline") dataset$Xorig else dataset$X
        run_lovo(X, mdist_type, mdist_preset, param_set,legacy_gower_sum = (method == "gower"))
      }
    )
  )


 save(file="simulation_structure.RData", simulation_structure)
Code
reference_method = "baseline"

baseline_refs = simulation_structure |>
  filter(method == "baseline") |>
  select(replicate, baseline_loo = loo_results)


rel_simulation_structure <- simulation_structure |>
  dplyr::left_join(baseline_refs, by = "replicate") |>
  dplyr::mutate(
    loo_results = purrr::map2(.x = loo_results,.y = baseline_loo,
        .f = \(res_obj, ref_obj) {
        res <- res_obj$results
        ref <- ref_obj$results

        # join by variable to be safe (order might differ)
        dplyr::left_join(
          res,
          dplyr::select(ref, variable,
                        ref_mad_normalized = mad_normalized,
                        ref_ac_importance  = ac_importance),
          by = "variable"
        ) |>
          dplyr::mutate(
            mad_relative_importance = mad_normalized / ref_mad_normalized,
            ac_relative_importance  = ac_importance  / ref_ac_importance
          )
      }
    )
  ) |>
  select(-baseline_loo)

results_long <- rel_simulation_structure |>
  tidyr::unnest(loo_results) |>
  # enforce one value per replicate (if already unique, no change)
  dplyr::group_by(replicate, variable, method) |>
  dplyr::summarise(
    dplyr::across(
      c(mad_importance, cc_importance, ac_importance,
        mad_normalized, mad_relative_importance, ac_relative_importance),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

results_summary <- results_long |>
  dplyr::group_by(variable, method) |>
  dplyr::summarise(
    dplyr::across(
      c(mad_importance, cc_importance, ac_importance,
        mad_normalized, mad_relative_importance, ac_relative_importance),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd   = \(x) sd(x,   na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

results_wide <- results_summary |>
  tidyr::pivot_wider(
    names_from = method,
    values_from = -c(variable, method)
  )

measure_names <- c(
  "mad_importance",
  "cc_importance",
  "ac_importance",
  "mad_normalized",
  "mad_relative_importance",
  "ac_relative_importance"
)

split_by_measure_mean <- purrr::map(
  measure_names,
  \(measure) {
    results_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_mean_"))) |>
      dplyr::rename_with(
        ~ stringr::str_remove(., paste0(measure, "_mean_")),
        -variable
      )
  }
) |> rlang::set_names(measure_names)

split_by_measure_sd <- purrr::map(
  measure_names,
  \(measure) {
    results_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_sd_"))) |>
      dplyr::rename_with(
        ~ stringr::str_remove(., paste0(measure, "_sd_")),
        -variable
      )
  }
) |> rlang::set_names(measure_names)  

Code for Figure 1

Code
meas_to_plot_mad_mean <- split_by_measure_mean |> pluck("mad_importance") |> 
  mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_mad_sd <- split_by_measure_sd |> pluck("mad_importance") |> 
  mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

mad_max <- max(meas_to_plot_mad_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max
)  

meas_to_plot_mad_rel_mean <- split_by_measure_mean |> pluck("mad_normalized") |> 
  mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_mad_rel_sd <- split_by_measure_sd |> pluck("mad_normalized") |> 
  mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6)

mad_max_rel <- max(meas_to_plot_mad_rel_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data_rel <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel
)
  
loo_mad_plot <- meas_to_plot_mad_mean |> 
   canon_cols() |> 
  dplyr::select(variable, var_id, any_of(method_levels)) |> 
  pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |> 
  mutate(
    method = factor(method, levels = method_levels),
    additive = ifelse(method %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),
                      "additive distances", "non-additive distances")
  ) |>
  dplyr::left_join(
    meas_to_plot_mad_sd |>
       canon_cols() |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = method_levels)),
    by = c("variable","var_id","method")
  ) |>
  ggplot(aes(x = var_id, y = value, color = method, shape = method)) +
  geom_rect(data=rect_data,
            aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
  geom_rect(data=rect_data,
            aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +
  geom_errorbar(aes(ymin = value - sd, ymax = value + sd), width = 0.12, linewidth = 0.35) +
  geom_point() + geom_line() +
  ylab("absolute variable contribution to distance") +
  xlab("") + theme_bw() + theme(legend.position = "none") +
  scale_x_continuous(breaks=seq(1,6,1), labels = rep("",6)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  scale_shape_manual(values = my_shapes) +
  scale_color_manual(values = my_colors) +
  facet_wrap(.~additive)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")

nonadd_methods <- c("naive","hl","gudmm","dkps")

# add_methods <- c("gower", "mod_gower", "hl_add", "u_ind", "u_dep", "u_mix")

legend_breaks <- c(
  c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),  # additive first
  c("naive","hl","gudmm","dkps")                           # non-additive after
)

legend_labels <- setNames(
  ifelse(legend_breaks %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),
         paste0(legend_breaks, " (additive)"),
         paste0(legend_breaks, " (non-additive)")),
  legend_breaks
)

library(ggnewscale)

loo_mad_rel_df <- meas_to_plot_mad_rel_mean |>
   canon_cols() |>
  dplyr::select(variable, var_id, any_of(method_levels)) |>
  tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |>
  mutate(
    method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  ) |>
  dplyr::left_join(
    meas_to_plot_mad_rel_sd |>
       canon_cols() |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix"))),
    by = c("variable","var_id","method")
  )

loo_mad_rel_plot <-
  ggplot() +
  geom_rect(data=rect_data_rel,
            aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
  geom_rect(data=rect_data_rel,
            aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +

  # --- Additive layer + legend 1 ---
  # geom_errorbar(
  #   data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
  #   aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method),
  #   width = 0.12, linewidth = 0.35
  # ) +
  geom_line(
    data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
    aes(x = var_id, y = value, color = method)
  ) +
  geom_point(
    data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, shape = method)
  ) +
  scale_color_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name   = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  guides(
  color = guide_legend(
    order = 1,
    override.aes = list(shape = my_shapes[add_methods])
  ),
  shape = "none"
) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +

  # --- Non-additive layer + legend 2 ---
  # geom_errorbar(
  #   data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
  #   aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method),
  #   width = 0.12, linewidth = 0.35
  # ) +
  geom_line(
    data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method)
  ) +
  geom_point(
    data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, shape = method)
  ) +
  scale_color_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name   = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  guides(
  color = guide_legend(
    order = 2,
    override.aes = list(shape = my_shapes[nonadd_methods])
  ),
  shape = "none"
) +

  # cosmetics (same as yours)
  ylab("relative variable contribution to distance") +
  xlab("left-out variable") + theme_bw() +
  scale_x_continuous(
    breaks=seq(1,6,1),
    labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text = element_blank(),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
    )+facet_wrap(.~additive)

fig_1_plot = loo_mad_plot / loo_mad_rel_plot

fig_1_plot

Code for Figure 2

Code
library(ggnewscale)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

meas_to_plot_ac_mean <- split_by_measure_mean |> pluck("ac_relative_importance") |> 
  mutate(measure="ac_relative_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_ac_sd <- split_by_measure_sd |> pluck("ac_relative_importance") |> 
  mutate(measure="ac_relative_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

ac_max <- max(meas_to_plot_ac_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data_ac <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=ac_max,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=ac_max
)

ac_df <- meas_to_plot_ac_mean |>
   canon_cols() |>
  dplyr::select(variable, var_id, any_of(method_levels)) |>
  tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |>
  mutate(
    method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  ) |>
  dplyr::left_join(
    meas_to_plot_ac_sd |>
       canon_cols() |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix"))),
    by = c("variable","var_id","method")
  )

ac_plot <-
  ggplot() +
  geom_rect(data=rect_data_ac,
            aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
  geom_rect(data=rect_data_ac,
            aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +

  # --- Additive legend (left / first) ---
  geom_line(
    data = dplyr::filter(ac_df, additive == "additive"),
    aes(x = var_id, y = value, color = method)
  ) +
  geom_point(
    data = dplyr::filter(ac_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, shape = method)
  ) +
  scale_color_manual(name = "additive", values = my_colors, breaks = add_methods) +
  scale_shape_manual(name = "additive", values = my_shapes, breaks = add_methods) +
  guides(
    color = guide_legend(order = 1, override.aes = list(shape = my_shapes[add_methods])),
    shape = "none"
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +

  # --- Non-additive legend (right / second) ---
  geom_line(
    data = dplyr::filter(ac_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method)
  ) +
  geom_point(
    data = dplyr::filter(ac_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, shape = method)
  ) +
  scale_color_manual(name = "non-additive", values = my_colors, breaks = nonadd_methods) +
  scale_shape_manual(name = "non-additive", values = my_shapes, breaks = nonadd_methods) +
  guides(
    color = guide_legend(order = 2, override.aes = list(shape = my_shapes[nonadd_methods])),
    shape = "none"
  ) +

  # cosmetics
  ylab("relative alienation coefficient") +
  xlab("left-out variable") +
  scale_x_continuous(
    breaks=seq(1,6,1),
    labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  )+  facet_wrap(.~additive)

ac_plot

Retrieval of the true configuration

Data generation

Code
recovery_structure <- tidyr::crossing(
  tibble::tibble(replicate = 1:reps),
  method = methods,
  categories = qoptions
) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |>
  dplyr::mutate(
    dataset = purrr::map2(
      .x = replicate, .y = categories,
      ~ generate_dataset(
        n = n, porig = porig, pn = pn,
        pnnoise = pnnoise, pcnoise = pcnoise,
        sigma = sigma, qoptions = .y, mode = "shared",
        seed = 1234 + .x
      )
    ),
    # compute distance matrix using method specs + baseline uses Xorig
    distance_matrix = purrr::pmap(
      dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set),
      \(dataset, method, mdist_type, mdist_preset, param_set) {
        X <- if (method == "baseline") dataset$Xorig else dataset$X

        run_mdist_within(
          df = X,
          mdist_type = mdist_type,
          mdist_preset = mdist_preset,
          param_set = param_set,
          outcome = NULL   # because X / Xorig have no response column
        ) |>
          base::as.matrix()
      }
    ),

    mds_results = purrr::map(
      distance_matrix,
      ~ cmdscale(.x, eig = TRUE, k = 2)$points |>
        as.data.frame() |>
        setNames(c("x1", "x2"))
    ),

    congruence_coeff = purrr::map2(
      .x = mds_results, .y = dataset,
      \(mds, ds) congruence_coeff(ds$truth, mds)
    )
  )

# save(file="recovery_structure.RData", recovery_structure)
recovery_structure_lite = recovery_structure |>
  select(-dataset, -distance_matrix, -mds_results) |>
  unnest(congruence_coeff) |>
  select(replicate, method, categories, congruence_coeff) |> 
  mutate(categories=factor(paste0(categories," categories")),
         alienation_coeff = sqrt(1-congruence_coeff^2))

save(file="recovery_structure_lite.RData", recovery_structure_lite)

Code for figure Figure 3

Code
library(dplyr)
library(ggplot2)
library(ggnewscale)

# n_add   <- length(add_methods)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")
method_order <- c(add_methods, nonadd_methods, "baseline")

n_add    <- length(add_methods)
n_nonadd <- length(nonadd_methods)

n_total <- length(method_order)

rect_bg <- tibble::tibble(
  xmin = c(0.5, n_add + 0.5),
  xmax = c(n_add + 0.5, n_add + n_nonadd + 0.5),
  ymin = c(-Inf, -Inf),
  ymax = c( Inf,  Inf),
  grp  = c("additive", "non-additive")
)
# --- method groups ---


# x-axis order: additive first, then non-additive, baseline last

# --- prepare data ---
box_df <- recovery_structure_lite |>
  mutate(
    method = ifelse(is.na(method), "baseline", method),
    method = canon_method(method),  # paper naming
    method = factor(method, levels = method_order),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  )

# --- plot ---
boxplot_figure_simu <-
  ggplot(box_df, aes(x = method, y = alienation_coeff)) +
  geom_rect(
  data = dplyr::filter(rect_bg, grp == "additive"),
  aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  alpha = 0.20, fill = "dodgerblue", color = "lightgrey"
) +
geom_rect(
  data = dplyr::filter(rect_bg, grp == "non-additive"),
  aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  alpha = 0.20, fill = "indianred", color = "lightgrey"
) +
  facet_wrap(~ categories) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  ylab("alienation coefficient") +
  xlab("methods") +

  # --- additive boxes + legend 1 (left) ---
  geom_boxplot(
    data = dplyr::filter(box_df, additive == "additive"),
    aes(fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  scale_fill_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  guides(fill = guide_legend(order = 1)) +

  ggnewscale::new_scale_fill() +

  # --- non-additive boxes (incl baseline) + legend 2 (right) ---
  geom_boxplot(
    data = dplyr::filter(box_df, additive == "non-additive"),
    aes(fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  scale_fill_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = c(nonadd_methods, "baseline")  # baseline included here; appears last in x-axis
  ) +
  guides(fill = guide_legend(order = 2))

boxplot_figure_simu

Clustering experiment: partitioning around medoids

numsep =.1 and catsep=0.35 generation

Code
nrep <- 100
k_true <- 4
q <- 9
numsep <- 0.1
catsep <- 0.35
clustSizeEq <- 50

param_generator_grid <- tibble::tibble(
  numsignal = c(0,0,4,4,4,8,8,8),
  numnoise  = c(8,8,4,4,4,0,0,0),
  catsignal = c(4,8,0,4,8,0,4,8),
  catnoise  = c(4,0,8,4,0,8,4,0)
) |>
  tidyr::crossing(replicate = 1:nrep)

mixed_datasets_grid <- param_generator_grid |>
  dplyr::mutate(
    data = purrr::pmap(
      dplyr::pick(numsignal, catsignal, numnoise, catnoise, replicate),
      \(numsignal, catsignal, numnoise, catnoise, replicate) {
        gen_mixed(
          k_true      = k_true,
          clustSizeEq = clustSizeEq,
          numsignal   = numsignal,
          numnoise    = numnoise,
          catsignal   = catsignal,
          catnoise    = catnoise,
          q           = q,
          q_err       = q,
          numsep      = numsep,
          catsep      = catsep,
          seed        = replicate
        )
      }
    )
  )

methods <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")

sim_methods_grid <- mixed_datasets_grid |>
  tidyr::crossing(method = methods) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method")

stopifnot(!any(is.na(sim_methods_grid$mdist_type)))

ari_pam_results_01_035 <- sim_methods_grid |>
  dplyr::mutate(
    ari = purrr::pmap_dbl(
      dplyr::pick(data, mdist_type, mdist_preset, param_set),
      \(data, mdist_type, mdist_preset, param_set) {
        df <- data$df
        D <- run_mdist_within(df, mdist_type, mdist_preset, param_set, outcome = "y")
        pam_fit <- cluster::pam(D, k = k_true, diss = TRUE)
        mclust::adjustedRandIndex(df$y, pam_fit$clustering)
      }
    )
  )
Code
# Method groups (as you provided)
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")
method_levels  <- c(add_methods, nonadd_methods)

# Facet order (row-wise)
scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_plot_df_01_035 <- ari_pam_results_01_035 |>
  mutate(
    # Used only to match facet order; preserves leading zeros
    scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise),

    scenario = str_glue(
      "num: {numsignal} s. + {numnoise} n. | cat: {catsignal} s. + {catnoise} n."
    ),
method = canon_method(method),
    # Row-wise facet order as provided
    scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]),

    # Method order
    method = factor(method, levels = method_levels),

    # Additive / Non-additive label
    method_type = if_else(as.character(method) %in% add_methods, "Additive", "Non-additive")
  )

# Background bands (no fill scale consumed)
k_add <- length(add_methods)
k_tot <- length(method_levels)

bg_add <- tibble(
  xmin = 0.5, xmax = k_add + 0.5,
  ymin = -Inf, ymax = Inf
)
bg_nonadd <- tibble(
  xmin = k_add + 0.5, xmax = k_tot + 0.5,
  ymin = -Inf, ymax = Inf
)

ari_plot_01_035 <- ggplot() +
  # Shaded regions
  geom_rect(
    data = bg_add,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE,
    fill = "indianred",
    alpha = 0.12,
    show.legend = FALSE
  ) +
  geom_rect(
    data = bg_nonadd,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE,
    fill = "dodgerblue",
    alpha = 0.12,
    show.legend = FALSE
  ) +

  # Additive boxplots + legend block
  geom_boxplot(
    data = ari_plot_df_01_035 |> filter(method_type == "Additive"),
    aes(x = method, y = ari, fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  # Additive legend (comes first)
scale_fill_manual(
  name   = "Additive",
  values = my_colors[add_methods],
  breaks = add_methods,
  drop   = FALSE,
  guide  = guide_legend(order = 1)
) +

  ggnewscale::new_scale_fill() +

  # Non-additive boxplots + legend block
  geom_boxplot(
    data = ari_plot_df_01_035 |> filter(method_type == "Non-additive"),
    aes(x = method, y = ari, fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
 # Non-additive legend (comes second)
scale_fill_manual(
  name   = "Non-additive",
  values = my_colors[nonadd_methods],
  breaks = nonadd_methods,
  drop   = FALSE,
  guide  = guide_legend(order = 2)
) +
  facet_wrap(~ scenario, ncol = 2) +
  coord_cartesian(ylim = c(0, 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text  = element_text(size = 9),
    legend.text = element_text(size = 13)
  ) +
  labs(x = "", y = "ARI (PAM on dissimilarities)")

print(ari_plot_01_035)

Code
# ---- method groups ----
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

# ---- scenario order (shared with boxplot figure) ----
scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_parallel_df <- ari_pam_results_01_035 |>
  mutate(
    # unify method naming
    method = canon_method(method), 

    # key used for ordering
    scenario_id = sprintf(
      "%01d%01d%01d%01d",
      numsignal, numnoise, catsignal, catnoise
    ),

    # label shown on the x-axis
    scenario = stringr::str_glue(
      "n:({numsignal},{numnoise}); c:({catsignal},{catnoise})"
    )
  ) |>
  group_by(scenario_id, scenario, method) |>
  summarise(
    mean_ari = mean(ari, na.rm = TRUE),
    sd_ari   = sd(ari,   na.rm = TRUE),
    .groups  = "drop"
  ) |>
  mutate(
    # enforce scenario order explicitly
    scenario = factor(
      scenario,
      levels = scenario[match(scenario_order, scenario_id)]
    ),

    type = ifelse(method %in% add_methods, "additive", "non-additive"),
    method = factor(method, levels = c(add_methods, nonadd_methods))
  )

Code for Figure 4

Code
ari_parallel_plot <-
  ggplot() +

  # ================= ADDITIVE =================
  geom_errorbar(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(
      x = scenario,
      ymin = mean_ari - sd_ari,
      ymax = mean_ari + sd_ari,
      color = method
    ),
    width = 0.12,
    linewidth = 0.35
  ) +
  geom_line(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(x = scenario, y = mean_ari, color = method, group = method),
    linewidth = 0.6,alpha=.75
  ) +
  geom_point(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(x = scenario, y = mean_ari, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(name = "additive", values = my_colors, breaks = add_methods) +
  scale_shape_manual(name = "additive", values = my_shapes, breaks = add_methods) +
  guides(
    color = guide_legend(order = 1, override.aes = list(shape = my_shapes[add_methods])),
    shape = "none"
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +

  # ================= NON-ADDITIVE =================
  geom_errorbar(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(
      x = scenario,
      ymin = mean_ari - sd_ari,
      ymax = mean_ari + sd_ari,
      color = method
    ),
    width = 0.12,
    linewidth = 0.35
  ) +
  geom_line(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(x = scenario, y = mean_ari, color = method, group = method),
    linewidth = 0.6,alpha=.75
  ) +
  geom_point(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(x = scenario, y = mean_ari, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(name = "non-additive", values = my_colors, breaks = nonadd_methods) +
  scale_shape_manual(name = "non-additive", values = my_shapes, breaks = nonadd_methods) +
  guides(
    color = guide_legend(order = 2, override.aes = list(shape = my_shapes[nonadd_methods])),
    shape = "none"
  ) +

  labs(x = "scenario", y = "mean ARI (± 1 SD)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  )

ari_parallel_plot

Code
ggsave(
  filename = "figure_parallel_ari.pdf",
  plot     = ari_parallel_plot,
  width    = 18,
  height   = 11,
  units    = "cm"
)
Code
load(file = "ari_pam_experiment_numsep_01_catsep_035.RData")

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_rank_df <- ari_pam_results_01_035 |>
  mutate(
    method = canon_method(method),

    scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise),
    scenario = stringr::str_glue("n:({numsignal},{numnoise}); c:({catsignal},{catnoise})")
  ) |>
  group_by(scenario_id, scenario, method) |>
  summarise(
    mean_ari = mean(ari, na.rm = TRUE),
    .groups  = "drop"
  ) |>
  group_by(scenario_id, scenario) |>
  mutate(
    rank_desc = dplyr::dense_rank(dplyr::desc(mean_ari))  # 1 = best
  ) |>
  ungroup() |>
  mutate(
    # enforce scenario order (same as boxplot panels)
    scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]),

    type = ifelse(method %in% add_methods, "additive", "non-additive"),
    method = factor(method, levels = c(add_methods, nonadd_methods))
  )

Illustration: FIFA player data

Data preparation and analysis

Code
# Variables are organized: Categorical first, then numerical

data("fifa_nl", package = "manydist")
#Xorig<-X<-ddata[,-c(1,2)]
X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot
n<-nrow(X)
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats

X=X[,c(1:12,14,13,15)]
p<-ncol(X)
X[8:15] <- lapply(X[8:15], as.numeric)
fifa_clean <- as_tibble(X)  |> 
  select(where(is.integer), where(is.numeric),where(is.factor) ,-release_clause_eur) |> 
  rename(pref_foot=`preferred_foot`,
         club=`club_name`,
         position=`team_position`)
Code
fifa_structure <- tibble::tibble(
  method = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |>
  dplyr::mutate(
    loo_results = purrr::pmap(
      dplyr::pick(method, mdist_type, mdist_preset, param_set),
      \(method, mdist_type, mdist_preset, param_set) {
        run_lovo(
          df = fifa_clean,
          mdist_type = mdist_type,
          mdist_preset = mdist_preset,
          param_set = param_set,
          outcome = NULL,
          dims = nd,
          keep_dist = FALSE,
          legacy_gower_sum = (method == "gower")  # triggers only for preset+gower anyway
        )
      }
    )
  )
Code
results_fifa_wide <- fifa_structure |>
  dplyr::mutate(loo_tbl = purrr::map(loo_results, "results")) |>
  dplyr::select(method, loo_tbl) |>
  tidyr::unnest(loo_tbl) |>
  tidyr::pivot_wider(
    id_cols    = variable,
    names_from = method,
    values_from = c(mad_importance, cc_importance, ac_importance, mad_normalized)
  )


measure_names <- c(
  "mad_importance",
  "cc_importance",
  "ac_importance",
  "mad_normalized"
)

fifa_split_by_measure <- purrr::map(
  measure_names,
  \(measure) {
    results_fifa_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_"))) |>
      dplyr::rename_with(~ stringr::str_remove(.x, paste0(measure, "_")))
  }
) |>
  rlang::set_names(measure_names)

Code for Figure 5

Code
meas_to_plot_mad = fifa_split_by_measure |> pluck("mad_importance") |>
  mutate(measure="mad_importances") |> slice(c(8:14,1:7)) |>  mutate(var_id=1:n())


mad_max = max(meas_to_plot_mad |> select(where(is.numeric),-var_id))

rect_data = tibble(
  xmin_cont=8.5,xmax_cont=14,ymin_cont=0,ymax_cont=mad_max,
  xmin_cat=.5,xmax_cat=8.5,ymin_cat=0,ymax_cat=mad_max)

meas_to_plot_mad_rel = fifa_split_by_measure |> pluck("mad_normalized") |>
  mutate(measure="mad_normalized")|> slice(c(8:14,1:7)) |> mutate(var_id=1:n())

mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric),-var_id))

rect_data_rel = tibble(
  xmin_cont=8.5,xmax_cont=14,ymin_cont=0,ymax_cont=mad_max_rel,
  xmin_cat=.5,xmax_cat=8.5,ymin_cat=0,ymax_cat=mad_max_rel)

# --- DROP-IN: Figure 4 (MAD importances) with split legend (shown only on bottom plot) ---


method_levels <- c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")
additive_set  <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
conflict_prefer("intersect", "base")
conflict_prefer("setdiff", "base")
add_present    <- intersect(additive_set, method_levels)
nonadd_present <- setdiff(method_levels, additive_set)

# ---------- TOP: absolute contributions ----------
loo_mad_long <- meas_to_plot_mad |>
   canon_cols() |>
  pivot_longer(
    cols = any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

loo_mad_plot <-
  ggplot() +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +
  # Additive
  geom_line(
    data = filter(loo_mad_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE,
    guide  = guide_legend(order = 1, override.aes = list(shape = my_shapes[add_present]))
  ) +
  scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  guides(shape = "none") +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  # Non-additive
  geom_line(
    data = filter(loo_mad_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE,
    guide  = guide_legend(order = 2, override.aes = list(shape = my_shapes[nonadd_present]))
  ) +
  scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  guides(shape = "none") +
  ylab("absolute variable contribution to distance") +
  theme_bw() +
  scale_x_continuous(breaks = seq(1, 14, 1), labels = rep("", 14)) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "none"   # <-- keep legend hidden on TOP plot
  ) +
  xlab("") +
  facet_wrap(. ~ additive)

# ---------- BOTTOM: relative contributions ----------
loo_mad_rel_long <- meas_to_plot_mad_rel |>
   canon_cols() |>
  pivot_longer(
    cols = any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

loo_mad_rel_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +
  # Additive
  geom_line(
    data = filter(loo_mad_rel_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_rel_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE,
    guide  = guide_legend(order = 1, override.aes = list(shape = my_shapes[add_present]))
  ) +
  scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  guides(shape = "none") +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  # Non-additive
  geom_line(
    data = filter(loo_mad_rel_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_rel_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE,
    guide  = guide_legend(order = 2, override.aes = list(shape = my_shapes[nonadd_present]))
  ) +
  scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  guides(shape = "none") +
  ylab("relative variable contribution to distance") +
  theme_bw() +
  scale_x_continuous(breaks = seq(1, 14, 1), labels = meas_to_plot_mad_rel$variable) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text  = element_blank(),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  xlab("") +
  facet_wrap(. ~ additive)

# ---------- COMBINE + SAVE ----------
fig_4_plot <- loo_mad_plot / loo_mad_rel_plot
fig_4_plot

Code for Figure 6

Code
method_levels <- c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")
additive_set  <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")

meas_to_plot_ac <- fifa_split_by_measure |>
  purrr::pluck("ac_importance") |>
  dplyr::mutate(measure = "ac_importance") |>
  dplyr::slice(c(8:14, 1:7)) |>
  dplyr::mutate(var_id = dplyr::row_number())

max_ac <- meas_to_plot_ac |>
  dplyr::select(where(is.numeric), -var_id) |>
  unlist(use.names = FALSE) |>
  max(na.rm = TRUE)

rect_data_ac <- tibble::tibble(
  xmin_cont = 7.5, xmax_cont = 14,  ymin_cont = 0, ymax_cont = max_ac,
  xmin_cat  = 0.5, xmax_cat  = 7.5, ymin_cat  = 0, ymax_cat  = max_ac
)

ac_long <- meas_to_plot_ac |>
  canon_cols() |>
  tidyr::pivot_longer(
    cols = dplyr::any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  dplyr::mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

add_present    <- intersect(additive_set, unique(as.character(ac_long$method)))
nonadd_present <- intersect(setdiff(method_levels, additive_set), unique(as.character(ac_long$method)))

ac_plot <-
  ggplot2::ggplot() +
  ggplot2::geom_rect(
    data = rect_data_ac,
    ggplot2::aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  ggplot2::geom_rect(
    data = rect_data_ac,
    ggplot2::aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # ================= ADDITIVE =================
  ggplot2::geom_line(
    data = dplyr::filter(ac_long, type == "Additive"),
    ggplot2::aes(x = var_id, y = value, color = method, group = method),
    alpha = 0.85
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(ac_long, type == "Additive"),
    ggplot2::aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  ggplot2::scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE,
    guide  = ggplot2::guide_legend(order = 1, override.aes = list(shape = my_shapes[add_present]))
  ) +
  ggplot2::scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::guides(shape = "none") +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +

  # ================= NON-ADDITIVE =================
  ggplot2::geom_line(
    data = dplyr::filter(ac_long, type == "Non-additive"),
    ggplot2::aes(x = var_id, y = value, color = method, group = method),
    alpha = 0.85
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(ac_long, type == "Non-additive"),
    ggplot2::aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  ggplot2::scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE,
    guide  = ggplot2::guide_legend(order = 2, override.aes = list(shape = my_shapes[nonadd_present]))
  ) +
  ggplot2::scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::guides(shape = "none") +

  ggplot2::ylab("alienation coefficient") +
  ggplot2::theme_bw() +
  ggplot2::scale_x_continuous(
    breaks = seq(1, 14, 1),
    labels = meas_to_plot_ac$variable
  ) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  ggplot2::xlab("") +
  ggplot2::facet_wrap(. ~ additive)

ac_plot

Appendix D Categorical mean distances: skewed distributions

Code
LeHo<-function(X){  #  Returns the LeHo Distance (symmetric Kullback-Leibler)
                    #  X is matrix with conditional probabilities
#if(any(rowSums(X)!=1)){
#  print("Input must be conditional probs (row sums should be 1)")
#} else {
  
require(philentropy)

n<-nrow(X)
D<-matrix(0,n,n)
for (i in 1:(n-1)){
  for (j in (i+1):n){
    D[j,i]<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+
      distance(X[c(j,i),],"kullback-leibler",unit="log2")}

}

return(D)
}

Data generation

Code
# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)

#qi=5
qs<-c(2,3,5,10)#,18) # Some options for qi

nint<-length(dis)    # Number of options for d (nint-1)
rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
  sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
  rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))

#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
  #ds<-NULL
  qi<-qs[qis]
  for (i in (1:nint)){ # variant 2

    Deltam<-matrix(1,qi,qi)    # Matching
    diag(Deltam)<-rep(0,qi)    # Matching Delta

    #d=i/(2*qi)  # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
    d=dis[i]  # variant 2

    #ds<-rbind(ds,d)
    p<-c(d,rep((1-d)/(qi-1),(qi-1)))
    sum(p)

    Dp<-diag(p)
    pp<-p %*% t(p)
    spi<-(diag(Dp-pp))^-1
    spi5<-(diag(Dp-pp))^-.5

    DEsk<-(2/(qi^2))*Deltam # Delta Eskin

    DOF<-log(p)%*%t(log(p)) * Deltam  # OF
    n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
    #n<-ceiling(1/min(p))
    DIOF<-log(n*p)%*%t(log(n*p)) * Deltam  # inverse OF. If some np<1 we can get negatives

    n=160  # For HL we need to find a way to get somewhat consistent values. Let's choose n large
    marginals=p*n

    HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) #
    DHL<-Deltam*HLemp*2#*sqrt(2)  # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
    #DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
    #diag(DE)<-rep(0,qi)

    DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
    diag(DM)<-rep(0,qi)

    DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
    DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)

    #we<-ee[i,(qis)]<- t(p)%*%DE%*%p

    wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p   # Simple Matching
    we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p       # Eskin
    wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p       # OF
    wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p    # IOF
    whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p       # HL
    wm<-em[i,(qis)]<- t(p)%*%DM%*%p         # SD scaling
    wa<-ea[i,(qis)]<- t(p)%*%DA%*%p         # Cat dis scaling
    wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p   # MCA scaling

    # We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
    # We can do this dividing by expected values so that we look at what happens
    # when delta is commensurable. Or not
    # if not:

    #we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1

    sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we  # Eskin
    sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm  # Simple Matching
    sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl  # HL
    sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm  # SD
    sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p -  ea[i,(qis)]^2)/wa # Cat dissim
    sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca  # MCA
    sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof  # OF
    sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 )  #IOF

    # Let's consider spread among cat dissimilarities. But: This only makes sense
    # when on similar scales. So, use commensurable deltas: (Or not: see above)
    sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)])  # Eskin
    sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
    sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
    sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
    sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) #  Cat dissim,
    sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
    sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)])  # OF
    sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF



  }

}

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)

Code for Figure 7

Code
sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |>
    mutate(method=distance_name) |>
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
                 names_to = "Number of categories", values_to = "Mean distance")

  return(exp_mean_dist)
}

all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"),
                        sk_distance_to_plot(ee,"Eskin"),
                        sk_distance_to_plot(eof,"OF"),
                        sk_distance_to_plot(eiof,"IOF")
)

paper_plot_skew1=all_exp_mean_dist |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew1,width=10,height=6)

Code for Figure 8

Code
all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
                         sk_distance_to_plot(em,"Std. dev."),
                         sk_distance_to_plot(ea,"Cat. dissim.")
)

paper_plot_skew2=all_exp_mean_dist2 |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew2)

Code for Figure 9

Code
qs<-c(2,3,5,10)#,18) # Some options for qs

nint<-length(dis)    # Number of options for d (nint-1)

CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))

for (q1 in 1:length(qs)){  # cats row
  qi<-qs[q1]
  for (q2 in q1:length(qs)){ # cats cols
    qj<-qs[q2]
    for (pi in 1:nint){  # row distributions
      d1<-dis[pi]
      p1<-c(d1,rep((1-d1)/(qj-1),(qj-1)))  # nint different marginals: Distribution over colums
      for (pj in 1:nint){
        d2<-dis[pj]
        p2<-c(d2,rep((1-d2)/(qi-1),(qi-1)))  # corresponding col marginals: Distribution over rows
        Pi<-p2 %*% t(p1)   # joint table independent
        Ri<-Pi/rowSums(Pi) # conditional table independent
        #  Ci<-Ri/sqrt(p2)

        if (qj>qi) {
          Pd<-diag(p1[1:qi])
          Addzeros<-matrix(0,(qi-1),(qj-qi))
          Addps<-p1[qi+1:(qj-qi)]
          Add<-rbind(Addzeros,Addps)
          Pd<-cbind(Pd,Add)
        } else { #qi=qj
          pd<-p2
          Pd<-diag(pd)       # joint table dependent case (diagonal smallest q)
        }
        Rd<-Pd/rowSums(Pd)
        Cd<-Rd/sqrt(p2)    # Chi-square table: cond. prob/sqrt(p)

        # category dissimilarities:
        Dlhd<-LeHo(Rd)                   #  Le Ho
        # Dmsd<-MousaviSehhati(Pd)         # Moussavi
        Dtvd<-dist(Rd,"manhattan")/2     # TVD
        Dchd<-dist(Cd,"euclidean")^2     # Chi2 dist
        #    Dchd<-Dchd/(2*(qi-1))     # Chi2 dist corrected for number of categories...

        Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd
        # Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
        Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd
        Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd

        # Coefficient of variation

        CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
        # CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
        CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
        CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)

        Dlhi<-LeHo(Ri)
        # Dmsi<-MousaviSehhati(Pi)
        Dtvi<-dist(Ri,"manhattan")/2
        #  Dchi<-dist(Ci,"euclidean")^2

        Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2
        # Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
        Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2
        #  Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
      }
    }
  }
}



Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))

CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)


sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |>
    mutate(method=distance_name) |>
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
                 names_to = "Number of categories", values_to = "Mean distance")

  return(exp_mean_dist)
}


all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
                         sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)

paper_plot_skew3=all_exp_mean_dist3 |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew3,width=8,height=4)

Appendix E FIFA variable distributions

Code for Figure 10

Code
# load("data/Fifa21NL.RData") # Load the data
data(fifa_nl,package="manydist")
#Xorig<-X<-ddata[,-c(1,2)]
Xorig<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot


n<-nrow(X)
# p<-ncol(X)
K<-10  # Number of clusters in cluster analysis
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
Xorig=Xorig[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats
Xorig=Xorig[,-3]
X=X[,c(1:12,14,13,15)]
Xorig=Xorig[,c(1:12,14,13,15)]
p<-ncol(X)

# First convert integers to numeric manually
X[8:15] <- lapply(X[8:15], as.numeric)

# reordering / numeric first
X <- X %>%
  select(where(is.integer), where(is.numeric), where(is.factor))


X_tib_num = X |> as_tibble() |> select(where(is.numeric))

X_tib_cat = X |> as_tibble() |> select(where(is.factor))

levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
                                "Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
                                "Sparta","Vitesse","VVV","Willem II")

X_tib_cat=X_tib_cat |> rename(`club`=club_name,
                              `position`=team_position)

X_tib_num_nest =  tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|>
  mutate(
    hplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y

                 my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
               }
    )
  )


X_tib_cat_nest =  tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|>
  mutate(
    bplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y

                 my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5),
                         axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
               }
    )
  )


fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/
                          X_tib_cat_nest$bplot[[2]]/
                          X_tib_cat_nest$bplot[[3]]/
                          X_tib_cat_nest$bplot[[4]]/
                          X_tib_cat_nest$bplot[[5]]/
                          X_tib_cat_nest$bplot[[6]]/
                          X_tib_cat_nest$bplot[[7]])|
  (X_tib_num_nest$hplot[[1]]/
     X_tib_num_nest$hplot[[2]]/
     X_tib_num_nest$hplot[[3]]/
     X_tib_num_nest$hplot[[4]]/
     X_tib_num_nest$hplot[[5]]/
     X_tib_num_nest$hplot[[6]]/
     X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')


print(fifa_descriptive_plots)

References

Ghashti, J. S., and Thompson, J. R. J. (2024), “Mixed-type distance shrinkage and selection for clustering via kernel metric learning,” Journal of Classification. https://doi.org/https://doi.org/10.1007/s00357-024-09493-z.
Hennig, C., and Liao, T. F. (2013), How to Find an Appropriate Clustering for Mixed-Type Variables with Application to Socio-Economic Stratification,” Journal of the Royal Statistical Society Series C: Applied Statistics, 62, 309–369.
Liu, P., Yuan, H., Ning, Y., Chakraborty, B., Liu, N., and Peres, M. A. (2024), “A modified and weighted gower distance-based clustering analysis for mixed type data: A simulation and empirical analyses,” BMC Medical Research Methodology. https://doi.org/https://doi.org/10.1186/s12874-024-02427-8.
Mousavi, E., and Sehhati, M. (2023), “A generalized multi-aspect distance metric for mixed-type data clustering,” Pattern Recognition, 138, 109353.